set.seed(2)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "ggridges")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)df <- bind_rows(
pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_100_to_500/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "100-500"),
pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_500_to_1000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "500-1000"),
pmap_df(list(0:49), ~read_csv(glue("../proc_data/ahref/export_keywords_1000_to_10000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "1000-10000"),
pmap_df(list(0:33), ~read_csv(glue("../proc_data/ahref/export_keywords_10000_to_1000000000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "10000+")
)
df %<>% mutate(
difficulty_cat = case_when(
difficulty <= 10 ~ "Easy",
between(difficulty, 11, 30) ~ "Medium",
between(difficulty, 31, 70) ~ "Hard",
between(difficulty, 71, 100) ~ "Super hard"
))
dfs <- df %>% sample_n(20000)
tibble("Number of rows" = format(df %>% nrow(), big.mark = ",")) %>%
pander()| Number of rows |
|---|
| 2,507,759 |
The volume looks weird. It doesn’t follow the initial categories:
df %>% drop_na(volume) %>%
group_by(cat) %>%
summarise(
n = n(),
mean = mean(volume),
median = median(volume),
max = max(volume),
min = min(volume)
) %>%
arrange(median) %>%
pander()| cat | n | mean | median | max | min |
|---|---|---|---|---|---|
| 100-500 | 714519 | 291.3 | 100 | 7330000 | 10 |
| 500-1000 | 776898 | 407.7 | 200 | 5750000 | 10 |
| 1000-10000 | 402317 | 1064 | 300 | 5310000 | 10 |
| 10000+ | 272614 | 13835 | 800 | 45260000 | 10 |
Let’s look at some of the searches that had a low search volume in the initial data set:
df %>% filter(cat == "100-500", volume > 10000) %>%
select(keyword, volume) %>%
head(5) %>%
pander()| keyword | volume |
|---|---|
| new orleans | 336000 |
| beauty | 135000 |
| mine | 84000 |
| stupid | 83000 |
| newport news | 32000 |
This doesn’t looks like low volume words. And indeed they were not in the original data sets for 100-500 volume. I’m not sure how they came in.
This brings up the issue of upscaling.
For example, let’s say we are interested in the average keyword difficulty. We could simply take the average keyword difficulty of all the keywords in these exported_keyword datasets. But the representation in this data set is skewed, so that the keywords with high search volume are vastly over-represented, compared to the list of words in the original data set.
A more correct approach would be to weight them by how large a pool they are a selection from. Eg, if there are 100 keywords that are not included for each keyword in the 100-500 volume category, but only 1 keyword that was not included for each keyword in the 10000+ volume category, they would be assigned accordingly.
In fact, let’s see:
length_keyword_files <- function(min, max){
sql <- glue("SELECT count(*) as `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume >= {min}
AND keyword_info_search_volume < {max}")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb) %>% mutate(min = min, max = max)
}
scaling <- map2_df(c(100, 500, 1000, 10000), c(500, 1000, 10000, 1000000000), length_keyword_files) %>%
mutate(factor = count / 1000000) %>% relocate(min, max)
scaling %>% pander()| min | max | count | factor |
|---|---|---|---|
| 100 | 500 | 33059825 | 33.06 |
| 500 | 1000 | 5766061 | 5.766 |
| 1000 | 10000 | 8449417 | 8.449 |
| 10000 | 1e+09 | 1284194 | 1.284 |
For now I will just use this list as if its truth and not adjust for volume. This can be changed later.
tribble(~Mean, ~Median,
round(mean(df$difficulty, na.rm = T), 2), median(df$difficulty, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 14.33 | 6 |
dfs %>%
ggplot(aes(x = difficulty, y = volume)) +
geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
scale_y_log10(labels = comma) +
geom_smooth(method='loess', formula= y~x) +
labs(title = "Keyword difficulty and volume")dfs %>% drop_na(difficulty_cat) %>%
ggplot(aes(y = volume, x = difficulty_cat)) +
geom_violin(draw_quantiles = c(0.5)) +
scale_y_log10(labels = comma) +
labs(x = "Difficulty category", title = "Keyword difficulty and volume")dfs %>%
ggplot(aes(x = difficulty, y = cpc)) +
geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
scale_y_log10(labels = comma) +
geom_smooth(method='loess', formula= y~x) +
labs(title = "Keyword difficulty and cpc")dfs %>% drop_na(difficulty_cat) %>%
ggplot(aes(y = cpc, x = difficulty_cat)) +
geom_violin(draw_quantiles = c(0.5)) +
scale_y_log10(labels = comma) +
labs(x = "Difficulty category", title = "Keyword difficulty and cpc")dff <- dfs %>%
select(keyword, volume, clicks, cpc, serp_features, cps) %>%
separate_rows(serp_features, sep = ",") %>%
mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features))
nones <- dff %>% filter(serp_features == "(None)")
dffn <- dff %>% group_by(keyword) %>%
summarise(n_serp = n()) %>%
mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>%
mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>%
mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))
dffn <- left_join(dffn, dfs, by = "keyword")dff %>% group_by(serp_features) %>%
summarise(prop = n() / nrow(dfs)) %>%
ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
scale_x_continuous(labels = scales::percent) +
labs(y = "SERP feature", x = "", title = "Presence of SERP features")dffn %>% group_by(n_serp) %>%
summarise(n = n() / nrow(dffn)) %>%
ggplot(aes(x = n_serp, y = n)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
scale_y_continuous(labels = scales::percent)order <- dff %>%
group_by(serp_features) %>%
summarise(volume = mean(volume, na.rm = T)) %>%
arrange(volume) %>%
pull(serp_features)
dff %>%
drop_na(serp_features) %>%
mutate(serp_features = factor(serp_features, levels = order)) %>%
ggplot(aes(y = serp_features, x = volume)) +
stat_density_ridges(fill = "turquoise4", color = "black") +
scale_x_log10(limits = c(10, 50000)) +
labs(title = "SERP features and mean volume", x = "Mean volume")dffn %>% group_by(n_serp) %>%
summarise(volume = mean(volume, na.rm = T)) %>%
ggplot(aes(x = n_serp, y = volume)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "Number of serp features", title = "Number of SERP features and mean volume", y = "Mean volume")dffn %>%
ggplot(aes(y = n_serp, x = volume)) +
stat_density_ridges(fill = "turquoise4", color = "black") +
scale_x_log10(labels = comma) +
labs(y = "SERP features", title = "Number of SERP features and volume")order <- dff %>%
group_by(serp_features) %>%
summarise(mean_clicks = mean(clicks, na.rm = T)) %>%
arrange(mean_clicks) %>%
pull(serp_features)
dff %>%
mutate(serp_features = factor(serp_features, levels = order)) %>%
mutate(clicks = clicks + 1) %>%
ggplot(aes(y = serp_features, x = clicks)) +
stat_density_ridges(fill = "turquoise4", color = "black") +
scale_x_log10(labels = comma)dffn %>% group_by(n_serp) %>%
summarise(clicks = mean(clicks, na.rm = T)) %>%
ggplot(aes(x = n_serp, y = clicks)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "SERP features", title = "Number of SERP features and mean clicks", y = "Mean clicks")dffn %>%
mutate(clicks = clicks + 1) %>%
ggplot(aes(y = n_serp, x = clicks)) +
stat_density_ridges(fill = "turquoise4", color = "black") +
scale_x_log10(labels = comma) +
labs(y = "SERP features", title = "Number of SERP features and clicks")dfv <-
bind_rows(
df %>% mutate(region = "US"),
df %>% mutate(volume = global_volume - volume, region = "International")
)International searches have much more total volume:
dfv %>% group_by(region) %>%
summarise(volume = format(sum(volume, na.rm = T), big.mark = ",")) %>%
pander()| region | volume |
|---|---|
| International | 10,042,589,870 |
| US | 4,724,571,610 |
Internationally there are more searches with very low volume, while US has more searches with medium volume.
dfv %>% drop_na(volume) %>%
mutate(volume_group = case_when(volume < 100 ~ "< 100",
between(volume, 100, 1000) ~ "100 - 1000",
between(volume, 1000, 10000) ~ "1000 - 10,000",
volume > 10000 ~ "10,000 +")) %>%
mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 +"))) %>%
group_by(volume_group, region) %>%
summarise(n = n()) %>%
ggplot(aes(x = volume_group, y = n, fill = region)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.8) There is not a large difference in the number of searches with very high volume. However, the total volume of these searches is a lot higher internationally
dfv %>% drop_na(volume) %>%
mutate(volume_group = case_when(volume < 100 ~ "< 100",
between(volume, 100, 1000) ~ "100 - 1000",
between(volume, 1000, 10000) ~ "1000 - 10,000",
between(volume, 10000, 100000) ~ "10,000 - 100,000",
between(volume, 100000, 1000000) ~ "100,000 - 1M",
volume > 1000000 ~ "1M +")) %>%
mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 - 100,000", "100,000 - 1M", "1M +"))) %>%
group_by(volume_group, region) %>%
summarise(n = sum(volume)) %>%
ggplot(aes(x = volume_group, y = n, fill = region)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.8)Looking at the number of searches with above 1 million volume:
df_int <- df %>% mutate(international_volume = global_volume - volume) %>%
filter(international_volume > 0, global_volume > 0) %>%
mutate(volume_diff = log10(international_volume) - log10(volume))
tibble(International = df_int %>% filter(international_volume > 1000000) %>% nrow(),
US = df_int %>% filter(volume > 1000000) %>% nrow()) %>%
pander()| International | US |
|---|---|
| 1139 | 237 |
df_int_s <- df_int %>% sample_n(200000)
df_int_s %>%
ggplot(aes(x = volume, y = international_volume)) +
geom_jitter(size = 0.05, alpha = 0.02, height = 0.15, width = 0.15) +
scale_x_log10(labels = comma, expand = c(0,0)) +
scale_y_log10(labels = comma, expand = c(0,0)) +
geom_abline(intercept = 0, slope = 1, color = "turquoise4", size = 1) +
labs(x = "US volume", y = "International volume")We can see that they mostly follow each other, but there are some searches with large difference between them.
Higher volume internationally:
df_int %>%
arrange(desc(volume_diff)) %>%
filter(volume != 60) %>%
select(keyword, us_volume = volume, international_volume) %>%
head(5) %>%
pander()| keyword | us_volume | international_volume |
|---|---|---|
| filmoviplex | 10 | 295990 |
| cloroquina | 200 | 5869800 |
| parivahan sewa | 10 | 276990 |
| jokaroom | 10 | 173990 |
| handball em | 20 | 327980 |
Higher volume in US:
df_int %>%
arrange(volume_diff) %>%
select(keyword, us_volume = volume, international_volume) %>%
head(5) %>%
pander()| keyword | us_volume | international_volume |
|---|---|---|
| football playoff schedule | 602000 | 1000 |
| frontier mail | 586000 | 1000 |
| spectrum mobile | 526000 | 1000 |
| chase bank near me | 523000 | 1000 |
| spectrum internet | 998000 | 2000 |
Searches that have higher volume in US have a higher click-per-search on average than searches that have higher volume internationally.
df_int_s %>%
ggplot(aes(x = volume_diff, y = cps)) +
geom_point(alpha = 0.08, size = 0.08) +
#scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")
They also have a higher cost-per-click on average
df_int_s %>%
ggplot(aes(x = volume_diff, y = cpc)) +
geom_point(alpha = 0.08, size = 0.08) +
scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")
Searches that have higher volume internationally, tend to have higher difficulty
df_int_s %>%
ggplot(aes(x = volume_diff, y = difficulty)) +
geom_point(alpha = 0.08, size = 0.08) +
#scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")tribble(~Mean, ~Median,
round(mean(df$clicks, na.rm = T), 2), median(df$clicks, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 3125 | 307 |
dfs %>% ggplot(aes(x = clicks)) +
geom_histogram(fill = "turquoise4", color = "black") +
scale_x_log10(labels = comma) +
labs(title = "Distribution of number of clicks per search", y = "", x = "")!!!J: I don’t think there is any data in the file about organic / paid?